{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dc6cbc97-1844-4062-888b-ee0ca2b5369c",
   "metadata": {},
   "source": [
    "# Parse \"TABLE OF RECOMMENDED NUCLEAR MAGNETIC DIPOLE MOMENTS: PART I, LONG-LIVED STATES\"\n",
    "\n",
    "source: https://www-nds.iaea.org/publications/indc/indc-nds-0794/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "570b47ac-ffd7-436d-988b-693fe99f936c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from fractions import Fraction\n",
    "import re\n",
    "\n",
    "from camelot.io import read_pdf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "314d487f-a207-4b9d-8dac-dc072e5e682b",
   "metadata": {},
   "outputs": [],
   "source": [
    "pl.Config.set_tbl_rows(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1c508ba-358e-4764-9ee3-c090b139ca6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# labels assigned to the columns of the merged table, in order\n",
    "columns = [\n",
    "    \"Nucleus\",\n",
    "    \"Ex\",\n",
    "    \"T1/2\",\n",
    "    \"Jπ\",\n",
    "    \"m(nm)\",\n",
    "    \"Method\",\n",
    "    \"NSR Keynumber\",\n",
    "    \"Journal Reference\",\n",
    "]\n",
    "\n",
    "# extract the tables found on pages 13-43 of the PDF using the lattice flavor\n",
    "tabs = read_pdf(\"indc-nds-0794.pdf\", pages=\"13-43\", flavor=\"lattice\")\n",
    "\n",
    "# per table: empty strings become missing values, rows without any data are discarded\n",
    "cleaned = [\n",
    "    tab.df.replace(\"\", None).dropna(axis=\"index\", how=\"all\") for tab in tabs\n",
    "]\n",
    "\n",
    "# merge into a single dataframe, drop the first row and set column names\n",
    "df = pd.concat(cleaned).iloc[1:].set_axis(columns, axis=\"columns\")\n",
    "\n",
    "# store the raw parse so that later cells can start from the file\n",
    "df.to_parquet(\"raw_parse.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4f212c8-7d61-48ba-a81a-988ff9dca94e",
   "metadata": {},
   "source": [
    "## Process extracted table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e9b67b7-617f-4369-8668-83d9cf6e0028",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reload the raw parse as a polars frame, without the columns that are not used below\n",
    "df = pl.from_pandas(pd.read_parquet(\"raw_parse.parquet\")).drop(\n",
    "    [\"T1/2\", \"Method\", \"NSR Keynumber\", \"Journal Reference\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb227827-b811-426e-a523-bf150bcf37f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. drop the rows for the neutron and the antiproton\n",
    "# 2. drop excited states and keep only ground states (rows where Ex is \"0\")\n",
    "df = df.filter(~pl.col(\"Nucleus\").is_in([\"0 n 1\", \"antiproton\"])).filter(\n",
    "    pl.col(\"Ex\") == \"0\"\n",
    ")\n",
    "\n",
    "# split \"Nucleus\" into three groups (digits, letters, digits) and store them as\n",
    "# atomic number, symbol and mass number\n",
    "pattern = r\"(\\d+)\\s*([A-Za-z]+)\\s*(\\d+)\"\n",
    "df = df.with_columns(\n",
    "    pl.col(\"Nucleus\").str.extract_groups(pattern).alias(\"captures\")\n",
    ").with_columns(\n",
    "    pl.col(\"captures\").struct.field(\"1\").cast(pl.Int64).alias(\"atomic_number\"),\n",
    "    pl.col(\"captures\").struct.field(\"2\").alias(\"symbol\"),\n",
    "    pl.col(\"captures\").struct.field(\"3\").cast(pl.Int64).alias(\"mass_number\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e878f8a0-d149-4b74-aa1c-a1c24862a887",
   "metadata": {},
   "outputs": [],
   "source": [
    "# parse spin and parity\n",
    "# \"spin\" is an integer or a fraction such as 3/2, \"parity\" an optional trailing + or -\n",
    "spin_re = r\"(?P<spin>\\d+/\\d+|\\d+)(?P<parity>[+-]?)\"\n",
    "df = df.with_columns(\n",
    "    pl.col(\"Jπ\").str.extract_groups(spin_re).alias(\"captures\")\n",
    ").unnest(\"captures\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8726e022-0d8d-454d-80a6-8cdc7660d3d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# parse nuclear magnetic dipole moments and uncertainties\n",
    "# groups: optional sign, integer part, optional decimals, uncertainty digits\n",
    "val_unc_re = r\"(?P<value_sign>[+-])?(?P<value_lead>\\d+)\\.?(?P<value_decimals>\\d+)?\\s*\\(?(?P<value_unc>\\d+(?:\\.\\d+)?)\\)?\"\n",
    "df = (\n",
    "    df.with_columns(captures=pl.col(\"m(nm)\").str.extract_groups(val_unc_re))\n",
    "    .unnest(\"captures\")\n",
    "    .with_columns(\n",
    "        # number of decimal digits; 0 instead of null when the value has no decimal\n",
    "        # part, so that the uncertainty below does not silently become null\n",
    "        value_precision=pl.col(\"value_decimals\")\n",
    "        .str.len_chars()\n",
    "        .fill_null(0)\n",
    "        .cast(pl.Int32),\n",
    "    )\n",
    "    .with_columns(\n",
    "        # Float64 on purpose: Float32 keeps only ~7 significant digits and would\n",
    "        # round values that are given with more digits than that\n",
    "        magnetic_moment=pl.concat_str(\n",
    "            pl.col(\"value_sign\"),\n",
    "            pl.col(\"value_lead\"),\n",
    "            pl.lit(\".\"),\n",
    "            pl.col(\"value_decimals\"),\n",
    "            ignore_nulls=True,\n",
    "        ).cast(pl.Float64),\n",
    "        # scale the uncertainty digits to the last decimal place of the value:\n",
    "        # unc * 10**(-value_precision)\n",
    "        magnetic_moment_unc=pl.lit(10.0, dtype=pl.Float64).pow(\n",
    "            -pl.col(\"value_precision\").cast(pl.Float64)\n",
    "        )\n",
    "        * pl.col(\"value_unc\").cast(pl.Float64),\n",
    "    )\n",
    ")\n",
    "\n",
    "# compute g-factor\n",
    "# the spin string (e.g. \"3/2\") is converted to a plain float; the expression is\n",
    "# defined once and shared by the value and its uncertainty\n",
    "spin_value = pl.col(\"spin\").map_elements(\n",
    "    lambda x: float(Fraction(x)), return_dtype=pl.Float64\n",
    ")\n",
    "df = df.with_columns(\n",
    "    g_factor=pl.col(\"magnetic_moment\") / spin_value,\n",
    "    g_factor_uncertainty=pl.col(\"magnetic_moment_unc\") / spin_value,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "704ada9e-0bad-44ae-bb92-a09e7e120651",
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep the identifying columns, the original \"m(nm)\" text and the parsed values\n",
    "new = df.select(\n",
    "    \"atomic_number\",\n",
    "    \"mass_number\",\n",
    "    \"symbol\",\n",
    "    \"spin\",\n",
    "    \"parity\",\n",
    "    \"m(nm)\",\n",
    "    \"magnetic_moment\",\n",
    "    \"g_factor\",\n",
    "    \"g_factor_uncertainty\",\n",
    "    \"magnetic_moment_unc\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f92f303-9163-4cd7-b654-06f7ad2abd06",
   "metadata": {},
   "outputs": [],
   "source": [
    "new"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d98f5959-f0a4-4e2c-8e51-9b95b440d452",
   "metadata": {},
   "source": [
    "## Fetch isotope table for comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e725f2bc-6238-43c1-9657-172b0512efc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mendeleev.fetch import fetch_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c285b55-6c0f-4d03-a6af-945824b0092a",
   "metadata": {},
   "outputs": [],
   "source": [
    "isotopes = pl.from_pandas(fetch_table(\"isotopes\"))\n",
    "isotopes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2c4bb8c-977f-44cb-a748-728b782b20ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# join the parsed values with the fetched isotopes table (its clashing columns get\n",
    "# the suffix \"_true\") and show the rows where that table has no g-factor\n",
    "(\n",
    "    new.with_columns(pl.col(\"atomic_number\", \"mass_number\").cast(pl.Int64))\n",
    "    .join(\n",
    "        isotopes,\n",
    "        on=[\"atomic_number\", \"mass_number\"],\n",
    "        how=\"inner\",\n",
    "        suffix=\"_true\",\n",
    "    )\n",
    "    .select(\n",
    "        \"atomic_number\",\n",
    "        \"mass_number\",\n",
    "        \"symbol\",\n",
    "        \"g_factor\",\n",
    "        \"g_factor_true\",\n",
    "        \"parity\",\n",
    "        \"parity_true\",\n",
    "    )\n",
    "    .filter(pl.col(\"g_factor_true\").is_null())\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "835fe9ed-b096-4ca7-81ae-4baccc55f65c",
   "metadata": {},
   "source": [
    "## Update values in mendeleev db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f5b8f5c-eea5-4ea6-b2fc-bb01b4009637",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mendeleev.db import get_session, get_engine\n",
    "from mendeleev.models import Isotope"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1aa094dd-4116-4a09-b79c-2145ee6db0a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "session = get_session(read_only=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1180fa9e-0461-495b-9f66-a0c58e58b2cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# columns of `new` that are written to the matching Isotope rows\n",
    "fields = {\"spin\", \"parity\", \"g_factor\", \"g_factor_uncertainty\"}\n",
    "try:\n",
    "    for row in new.iter_rows(named=True):\n",
    "        values = {k: v for k, v in row.items() if k in fields}\n",
    "        session.query(Isotope).filter_by(\n",
    "            atomic_number=row[\"atomic_number\"], mass_number=row[\"mass_number\"]\n",
    "        ).update(values)\n",
    "    # commit once at the end so a failure cannot leave the table half-updated\n",
    "    session.commit()\n",
    "except Exception:\n",
    "    # undo any pending changes before re-raising\n",
    "    session.rollback()\n",
    "    raise\n",
    "finally:\n",
    "    # release the session whether or not the update succeeded\n",
    "    session.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "678cdd3e-14dd-49a4-9a46-4695b79ad83d",
   "metadata": {},
   "source": [
    "## Validate that new values are available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18a709f1-705b-4bc9-b5ac-9de5f638c377",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mendeleev import K, Ti, Cr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "607b863a-5b97-49fc-84cf-5675edc367c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "K.isotope(40).g_factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39b34f5d-a35d-4bd7-9ea3-b7fba48a06ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "Ti.isotope(47).g_factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc9f4770-12ec-455e-a770-83deac96c4a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "Ti.isotope(49).g_factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99c06982-f0ab-4ed6-a4c4-58436fc85d86",
   "metadata": {},
   "outputs": [],
   "source": [
    "Cr.isotope(53).g_factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8235c32-cb32-48f2-8cad-d8efd4b4bd63",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
